Data import

Import data in list via function Calll to import_csv_data().

data <- import_csv_data(path = "Olist_data/")
data_marketing <- import_csv_data(path = "Olist_Marketing_data/")

Overview of data

Source https://www.kaggle.com/jungjoonlee/eda-with-ecommerce-marketplace-seller-side

Marketing Qualified Lead (MQL) means a potential reseller/manufacturer who has an interest in selling their products on Olist. After a MQL filled a form on landing page to sign up for seller, a Sales Development Representative(SDR) contacted the MQL and gathered more information about the lead. Then a Sales Representative(SR) consulted the MQL. So interaction between SDRs/SRs and MQLs can affect conversion from MQLs to sellers. A MQL who finally signed up for seller is called a closed deal.

Closed Deals

skim(data_marketing$olist_closed_deals_dataset)
Data summary
Name data_marketing$olist_clos…
Number of rows 842
Number of columns 14
_______________________
Column type frequency:
factor 12
numeric 2
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
mql_id 0 1 FALSE 842 000: 1, 009: 1, 00d: 1, 010: 1
seller_id 0 1 FALSE 842 000: 1, 012: 1, 013: 1, 01f: 1
sdr_id 0 1 FALSE 32 4b3: 140, 068: 81, 56b: 74, 9d1: 66
sr_id 0 1 FALSE 22 4ef: 133, d3d: 82, 656: 74, 85f: 64
won_date 0 1 FALSE 824 201: 6, 201: 4, 201: 3, 201: 3
business_segment 0 1 FALSE 34 hom: 105, hea: 93, car: 77, hou: 71
lead_type 0 1 FALSE 9 onl: 332, onl: 126, ind: 123, off: 104
lead_behaviour_profile 0 1 FALSE 10 cat: 407, emp: 177, eag: 123, wol: 95
has_company 0 1 FALSE 3 emp: 779, Tru: 58, Fal: 5
has_gtin 0 1 FALSE 3 emp: 778, Tru: 54, Fal: 10
average_stock 0 1 FALSE 7 emp: 776, 5-2: 22, 50-: 15, 1-5: 10
business_type 0 1 FALSE 4 res: 587, man: 242, emp: 10, oth: 3

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
declared_product_catalog_size 773 0.08 233.03 352.38 1 30 100 300 2e+03 ▇▁▁▁▁
declared_monthly_revenue 0 1.00 73377.68 1744799.18 0 0 0 0 5e+07 ▇▁▁▁▁
head(data_marketing$olist_closed_deals_dataset) %>% datatable(., rownames = FALSE, options = list(scrollX = TRUE))

MQL

skim(data_marketing$olist_marketing_qualified_leads_dataset)
Data summary
Name data_marketing$olist_mark…
Number of rows 8000
Number of columns 4
_______________________
Column type frequency:
factor 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
mql_id 0 1 FALSE 8000 000: 1, 000: 1, 001: 1, 002: 1
first_contact_date 0 1 FALSE 336 201: 93, 201: 76, 201: 75, 201: 71
landing_page_id 0 1 FALSE 495 b76: 912, 22c: 883, 583: 495, 887: 445
origin 0 1 FALSE 11 org: 2296, pai: 1586, soc: 1350, unk: 1099
head(data_marketing$olist_marketing_qualified_leads_dataset) %>% datatable(., rownames = FALSE, options = list(scrollX = TRUE))

EDA Closed Deals

Data wrangling

# merge dataframes
df_deals <- data_marketing$olist_closed_deals_dataset %>% 
  right_join(data_marketing$olist_marketing_qualified_leads_dataset, by = "mql_id")

# createa monthly dataframe for deals by origin by first contact date
df_deals_monthly <- df_deals %>% 
  mutate(first_contact_month = format.Date(as_date(first_contact_date), "%Y%m"),
         deal_closed = if_else(is.na(seller_id), 0, 1),
         origin = if_else(origin == "", "unknown", as.character(origin))) %>% 
  group_by(origin, first_contact_month) %>% 
  summarize(leads = n(),
            closed_deals = sum(deal_closed)) %>% 
  ungroup()

Analysis Leads

Leads Overall

plot_leads <- df_deals_monthly %>% 
  group_by(first_contact_month) %>% 
  summarize(leads = sum(leads)) %>% 
  ggplot(., aes(x = first_contact_month, y = leads, group = 1)) +
  geom_line() +
  geom_point() +
  theme_bw() +
  labs(x = "", y = "Number of Leads", colour = "origin")
ggplotly(plot_leads)
#plot_leads

Leads by Origin

plot_leads_origin <- df_deals_monthly %>% 
  select(-closed_deals) %>% 
  pivot_wider(., names_from = origin, values_from = leads) %>% 
  replace(is.na(.), 0) %>% 
  ggplot(., aes(x = first_contact_month, group = 1)) +
  geom_line(aes(y = direct_traffic, color = "direct_traffic")) +
  geom_line(aes(y = display, color = "display"))+
  geom_line(aes(y = email, color = "email")) +
  geom_line(aes(y = organic_search, color = "organic_search")) +
  geom_line(aes(y = paid_search, color = "paid_search")) +
  geom_line(aes(y = referral, color = "referral")) +
  geom_line(aes(y = social, color = "social")) +
  geom_line(aes(y = other_publicities, color = "other_publicities")) +
  geom_line(aes(y = other, color = "other")) +
  geom_line(aes(y = unknown, color = "unknown")) +
  geom_point(aes(y = direct_traffic, color = "direct_traffic")) +
  geom_point(aes(y = display, color = "display"))+
  geom_point(aes(y = email, color = "email")) +
  geom_point(aes(y = organic_search, color = "organic_search")) +
  geom_point(aes(y = paid_search, color = "paid_search")) +
  geom_point(aes(y = referral, color = "referral")) +
  geom_point(aes(y = social, color = "social")) +
  geom_point(aes(y = other_publicities, color = "other_publicities")) +
  geom_point(aes(y = other, color = "other")) +
  geom_point(aes(y = unknown, color = "unknown")) +
  theme_bw() +
  labs(x = "", y = "Number of Leads", colour = "origin")
ggplotly(plot_leads_origin)
#plot_leads_origin
plot_bar_leads_origin <- df_deals_monthly %>% 
  select(-closed_deals) %>% 
  group_by(origin) %>% 
  summarize(leads = sum(leads)) %>% 
  ggplot(., aes(x = reorder(origin, -leads), y = leads, group = 1)) +
  geom_col(aes(fill = origin))+
  geom_text(aes(label = ..y..), vjust = -1) +
  theme_bw() +
  labs(x = "", y = "Number of Leads", colour = "origin") +
  scale_y_continuous(limits = c(0, 2350))
ggplotly(plot_bar_leads_origin)
#plot_bar_leads_origin
plot_bar_leads_origin_per <- df_deals_monthly %>% 
  select(-closed_deals) %>% 
  group_by(origin) %>% 
  summarize(leads = sum(leads)) %>% 
  ggplot(., aes(x = reorder(origin, -leads), y = leads / sum(leads), group = 1)) +
  geom_col(aes(fill = origin))+
  geom_text(aes(label = paste0(round(..y.. * 100,1), "%")), vjust = -1) +
  theme_bw() +
  labs(x = "", y = "Number of Leads", colour = "origin") +
  scale_y_continuous(labels=scales::percent, limits = c(0, 0.3)) 
ggplotly(plot_bar_leads_origin_per)
#plot_bar_leads_origin_per

Analysis Closed Deals

Closed Deals Overall

plot_leads <- df_deals_monthly %>% 
  group_by(first_contact_month) %>% 
  summarize(leads = sum(closed_deals)) %>% 
  ggplot(., aes(x = first_contact_month, y = leads, group = 1)) +
  geom_line() +
  geom_point() +
  theme_bw() +
  labs(x = "", y = "Number of Closed Deals", colour = "origin")
ggplotly(plot_leads)
# plot_leads

Leads by Origin

plot_leads_origin <- df_deals_monthly %>% 
  select(-leads) %>% 
  pivot_wider(., names_from = origin, values_from = closed_deals) %>% 
  replace(is.na(.), 0) %>% 
  ggplot(., aes(x = first_contact_month, group = 1)) +
  geom_line(aes(y = direct_traffic, color = "direct_traffic")) +
  geom_line(aes(y = display, color = "display"))+
  geom_line(aes(y = email, color = "email")) +
  geom_line(aes(y = organic_search, color = "organic_search")) +
  geom_line(aes(y = paid_search, color = "paid_search")) +
  geom_line(aes(y = referral, color = "referral")) +
  geom_line(aes(y = social, color = "social")) +
  geom_line(aes(y = other_publicities, color = "other_publicities")) +
  geom_line(aes(y = other, color = "other")) +
  geom_line(aes(y = unknown, color = "unknown")) +
  geom_point(aes(y = direct_traffic, color = "direct_traffic")) +
  geom_point(aes(y = display, color = "display"))+
  geom_point(aes(y = email, color = "email")) +
  geom_point(aes(y = organic_search, color = "organic_search")) +
  geom_point(aes(y = paid_search, color = "paid_search")) +
  geom_point(aes(y = referral, color = "referral")) +
  geom_point(aes(y = social, color = "social")) +
  geom_point(aes(y = other_publicities, color = "other_publicities")) +
  geom_point(aes(y = other, color = "other")) +
  geom_point(aes(y = unknown, color = "unknown")) +
  theme_bw() +
  labs(x = "", y = "Number of Closed Deals", colour = "origin")
ggplotly(plot_leads_origin)
# plot_leads_origin
plot_bar_leads_origin <- df_deals_monthly %>% 
  select(-leads) %>% 
  group_by(origin) %>% 
  summarize(leads = sum(closed_deals)) %>% 
  ggplot(., aes(x = reorder(origin, -leads), y = leads, group = 1)) +
  geom_col(aes(fill = origin))+
  geom_text(aes(label = ..y..), vjust = -1) +
  theme_bw() +
  labs(x = "", y = "Number of Closed Deals", colour = "origin") +
  scale_y_continuous(limits = c(0, 300))
ggplotly(plot_bar_leads_origin)
# plot_bar_leads_origin
plot_bar_leads_origin_per <- df_deals_monthly %>% 
  select(-leads) %>% 
  group_by(origin) %>% 
  summarize(closed_deals = sum(closed_deals)) %>% 
  ggplot(., aes(x = reorder(origin, -closed_deals), y = closed_deals / sum(closed_deals), group = 1)) +
  geom_col(aes(fill = origin))+
  geom_text(aes(label = paste0(round(..y.. * 100,1), "%")), vjust = -1) +
  theme_bw() +
  labs(x = "", y = "Number of Closed Deals", colour = "origin") +
  scale_y_continuous(labels=scales::percent, limits = c(0, 0.35)) 
ggplotly(plot_bar_leads_origin_per)
# plot_bar_leads_origin_per

Conversion Rate

Closed Deals Overall

plot_conversion <- df_deals_monthly %>% 
  group_by(first_contact_month) %>% 
  summarize(leads = sum(leads),
            closed_deals = sum(closed_deals)) %>% 
  mutate(conversion_rate = closed_deals / leads) %>% 
  ggplot(., aes(x = first_contact_month, y = conversion_rate, group = 1)) +
  geom_line() +
  geom_point() +
  theme_bw() +
  labs(x = "", y = "Conversion Rate", colour = "origin") +
  scale_y_continuous(labels=scales::percent) 
ggplotly(plot_conversion)
# plot_conversion

Conversion Rate by Origin

2017

plot_bar_conversion_origin_2017 <- df_deals_monthly %>% 
  filter(substr(first_contact_month,1,4) == "2017") %>% 
  group_by(origin) %>% 
  summarize(leads = sum(leads),
            closed_deals = sum(closed_deals)) %>% 
  mutate(conversion_rate = closed_deals / leads) %>% 
  ggplot(., aes(x = reorder(origin, -conversion_rate), y = conversion_rate, group = 1)) +
  geom_col(aes(fill = origin))+
  geom_text(aes(label = paste0(round(..y.. * 100,1), "%")), vjust = -1) +
  theme_bw() +
  labs(x = "", y = "Conversion Rate", colour = "origin") +
  scale_y_continuous(labels=scales::percent, limits = c(0, 0.20))
ggplotly(plot_bar_conversion_origin_2017)
# plot_bar_conversion_origin_2017

2018

plot_bar_conversion_origin_2018 <- df_deals_monthly %>% 
  filter(substr(first_contact_month,1,4) == "2018") %>% 
  group_by(origin) %>% 
  summarize(leads = sum(leads),
            closed_deals = sum(closed_deals)) %>% 
  mutate(conversion_rate = closed_deals / leads) %>% 
  ggplot(., aes(x = reorder(origin, -conversion_rate), y = conversion_rate, group = 1)) +
  geom_col(aes(fill = origin))+
  geom_text(aes(label = paste0(round(..y.. * 100,1), "%")), vjust = -1) +
  theme_bw() +
  labs(x = "", y = "Conversion Rate", colour = "origin") +
  scale_y_continuous(labels=scales::percent, limits = c(0, 0.30))
ggplotly(plot_bar_conversion_origin_2018)
# plot_bar_conversion_origin_2018